##############################
# Media Measurement Matters  #
# Replication Code           #
# Preparation of Web Data    #
##############################

# The following file includes the steps for pre-processing the web-browsing data.
# Note that, in order to protect respondents' privacy, we do not supply the list of
# URLs visited by each respondent, as such URLs may contain identifying information. 
# However, in all cases, we provide all unique news domain visits for each respondent,
# which is the level at which we conduct the majority of our analyses.

# Set-up ----

# Start from an empty global environment (removes objects only; attached
# packages and options are left as they are)
rm(list = ls())

# Attach required packages
library(tidyverse)
library(urltools)

# Helper operator: the negation of %in% (TRUE where the left-hand value is
# not found in the right-hand set)
`%notin%` <- function(x, table) !(x %in% table)

# Load the raw web-browsing data (full set of site visits) and show its size
web <- read_rds("data/web_raw.rds")
dim(web)

# Remove sequential duplicates ----

# Keep only the visits whose `dupe` flag equals 0. filter() also drops rows
# where the flag is NA.
web_nodupe <- filter(web, dupe == 0)

# Total number of news visits after removing sequential duplicates
n_urls_nodupes <- nrow(web_nodupe)
n_urls_nodupes

# Pre-process BMA data (Appendix D) ----

# Load in BMA scores for top 500 domains
bma <- read_csv("data/bakshy_top500.csv")

# Remove www. prefix from all domains for comparison to comScore.
# The pattern is anchored to the start of the string and the dot is escaped,
# so only a literal leading "www." is stripped. (An unanchored "www.(.*)"
# would also rewrite domains that merely contain "www" somewhere, e.g.
# "awwwards.com" -> "ards.com".)
bma <- bma %>%
  mutate(domain_recode = str_remove(domain, "^www\\."))

# Identify domains listed both with and without the www. prefix (i.e., two or
# more rows sharing a domain_recode) and select the www. version to remove
domains_to_exclude <- bma %>%
  group_by(domain_recode) %>%
  filter(n() > 1, str_detect(domain, "^www\\.")) %>%
  ungroup() %>%
  pull(domain)

# Remove duplicate domains.
# The rows are dropped rather than having domain_recode set to NA: `%in%`
# treats NA as matching NA and dplyr joins match NA keys by default, so NA
# keys left in `bma` would make any NA lookup value further down (e.g., an
# empty `recode` cell) count as a match and pick up the removed rows' scores.
bma <- bma %>%
  filter(!(domain %in% domains_to_exclude))

# > Identify number of exact domain matches ----

# One row per unique domain in the de-duplicated comScore data, with its
# number of visits (n), most-visited domains first
unique_domain <- web_nodupe %>%
  group_by(domain) %>%
  tally(sort = TRUE)

# Flag (1/0) comScore domains that appear verbatim among the recoded BMA domains
unique_domain$exact <- as.numeric(unique_domain$domain %in% bma$domain_recode)

# Number of exactly matched domains
sum(unique_domain$exact) # 190 exact matches

# Share of all visits that fall on exactly matched domains
with(unique_domain, sum(n[exact == 1]) / sum(n)) # 80.1% of news visits covered

# > Match after removing third-level domain information ----

# Spot-checked false positives (i.e., international or non-English-language
# domains) that must not count as matches even though their stub is in BMA
false_pos <- c("cnnespanol.cnn.com", "russian.rt.com", "cn.nytimes.com",
               "digitaledition.napersun.chicagotribune.com",
               "digitaledition.newssunonline.chicagotribune.com",
               "quebec.huffingtonpost.ca")

# comScore domains that did not have an exact match
unmatched <- filter(unique_domain, exact == 0)

# Split each unmatched domain into its components; only the `domain` and
# `suffix` parts are kept, which removes third-level domain information
domain_stubs <- suffix_extract(domain(unmatched$domain))

# Build the "<domain>.<suffix>" stub and flag (1/0) stubs that appear among
# the recoded BMA domains, forcing the known false positives to 0
unmatched <- unmatched %>%
  mutate(stub = paste0(domain_stubs$domain, ".", domain_stubs$suffix),
         stub_match = as.numeric(stub %in% bma$domain_recode &
                                   !(domain %in% false_pos)))

# Number of domains matched via their stub
sum(unmatched$stub_match) # 344 additional matches

# Share of all visits covered by exact and stub matches together
sum(unique_domain$n[unique_domain$exact == 1],
    unmatched$n[unmatched$stub_match == 1]) /
  sum(unique_domain$n) # 81.9% of news visits now covered

# > Manually check remaining domains ----

# The remaining domains were reviewed by hand to identify possible matches
# that the exact and stub-based steps had not accounted for. The manual
# coding file supplies, per domain, a hand-assigned `recode` and visit count.
manual <- read_csv("data/manual_coding_bma.csv") %>%
  mutate(man_match = as.numeric(recode %in% bma$domain_recode)) # 1/0 flag

# Number of manually recoded domains found among the recoded BMA domains
sum(manual$man_match) # 28 additional matches

# Share of all visits covered by exact, stub and manual matches together
sum(unique_domain$n[unique_domain$exact == 1],
    unmatched$n[unmatched$stub_match == 1],
    manual$n[manual$man_match == 1]) /
  sum(unique_domain$n) # 82.2% of news visits now covered

# Construct list of matched domains: the original comScore domain, the name
# used to look it up in BMA, its visit count and a 1/0 match flag.
# Note that every manually coded row is included, whatever its match flag.
domains <- bind_rows(
  unique_domain %>%
    filter(exact == 1) %>%
    transmute(orig_domain = domain, domain_recode = domain, n, match = exact),
  unmatched %>%
    filter(stub_match == 1) %>%
    transmute(orig_domain = domain, domain_recode = stub, n,
              match = stub_match),
  manual %>%
    transmute(orig_domain = domain, domain_recode = recode, n,
              match = man_match)
) %>%
  arrange(desc(n)) %>%
  # Merge in BMA scores (avg_align) by the recoded domain name
  left_join(select(bma, domain_recode, avg_align), by = "domain_recode")

# > Merge web data and BMA scores ----

# Attach the matched-domain information (recoded domain, visit count, match
# flag, alignment score) to every visit. This is a left join, so visits to
# domains without an alignment score are kept, with NA in the joined columns.
web_scored <- left_join(web_nodupe, domains, by = c("domain" = "orig_domain"))

# Number and share of visits that received an alignment score
num_matched <- sum(!is.na(web_scored$avg_align))
perc_matched <- mean(!is.na(web_scored$avg_align))

# Number of distinct recoded domains among the rows flagged as matched
dom_matched <- n_distinct(domains$domain_recode[which(domains$match == 1)])

num_matched
perc_matched
dom_matched

# > Identify number of hard news matches ----

# Visits flagged as "hard news" (hard_news == 1)
web_scored_hard <- filter(web_scored, hard_news == 1)

# Hard news visits excluding the portal domains; visits whose recoded domain
# is NA are retained by this filter
web_scored_hard_np <- filter(
  web_scored_hard,
  !(domain_recode %in% c("aol.com", "msn.com", "google.com"))
)

num_matched_hard_np <- sum(!is.na(web_scored_hard_np$avg_align))

# Examine number of hard news URLs
nrow(web_scored_hard) # Total number of hard news URLs
sum(!is.na(web_scored_hard$avg_align)) # Hard news URLs with BMA score
num_matched_hard_np # Hard news URLs with BMA score, excluding portals
nrow(web_scored_hard) / nrow(web_scored) # Percent of news URLs predicted to be hard news

# Pre-process Eady et al. data (Appendix P) ----

# Load in Eady et al. media scores
eady <- read_csv("data/eady_media_scores.csv")

# > Identify number of exact domain matches ----

# One row per unique eady_domain in the de-duplicated comScore data, with its
# number of visits (n), most-visited first
unique_eady <- web_nodupe %>%
  group_by(eady_domain) %>%
  tally(sort = TRUE)

# Flag (1/0) comScore domains that appear verbatim among the Eady et al. domains
unique_eady$exact <- as.numeric(unique_eady$eady_domain %in% eady$domain)

# Number of exactly matched domains
sum(unique_eady$exact) # 86 exact matches

# Share of all visits that fall on exactly matched domains
with(unique_eady, sum(n[exact == 1]) / sum(n)) # 59.1% of news visits covered

# > Match after removing third-level domain information ----

# Spot-checked false positives (i.e., international or non-English-language
# domains) that must not count as matches even though their stub is matched
false_pos_eady <- c("CNNESPANOL.CNN.COM", "IR.VOANEWS.COM", "AMHARIC.VOANEWS.COM",
                    "RUSSIAN.RT.COM", "LEARNINGENGLISH.VOANEWS.COM", "CN.NYTIMES.COM")

# comScore domains that did not have an exact match
unmatched_eady <- filter(unique_eady, exact == 0)

# Split each unmatched domain into its components; only the `domain` and
# `suffix` parts are kept, which removes third-level domain information
domain_stubs_eady <- suffix_extract(domain(unmatched_eady$eady_domain))

unmatched_eady <- unmatched_eady %>%
  mutate(
    # Upper-cased "<DOMAIN>.<SUFFIX>" stub, compared against eady$domain
    stub = toupper(paste0(domain_stubs_eady$domain, ".",
                          domain_stubs_eady$suffix)),
    # 1/0 flag: stub is an Eady et al. domain and not a known false positive
    stub_match = as.numeric(stub %in% eady$domain &
                              !(eady_domain %in% false_pos_eady)),
    # Change Conservative Tribune (CT.WESTERNJOURNAL.COM) to reflect correct
    # domain. This runs after stub_match is set, so the flag still reflects
    # the original stub while the score lookup uses the corrected name.
    stub = if_else(eady_domain %in% "CT.WESTERNJOURNAL.COM",
                   "CONSERVATIVETRIBUNE.COM", stub)
  )

# Number of domains matched via their stub
sum(unmatched_eady$stub_match) # 159 additional matches

# Share of all visits covered by exact and stub matches together
sum(unique_eady$n[unique_eady$exact == 1],
    unmatched_eady$n[unmatched_eady$stub_match == 1]) /
  sum(unique_eady$n) # 60.8% of news visits now covered

# > Manually check remaining domains ----

# The remaining domains were reviewed by hand to identify possible matches
# that the exact and stub-based steps had not accounted for. The manual
# coding file supplies, per domain, a hand-assigned `recode` and visit count.
manual_eady <- read_csv("data/manual_coding_eady.csv") %>%
  mutate(man_match = as.numeric(recode %in% eady$domain)) # 1/0 flag

# Number of manually recoded domains found among the Eady et al. domains
sum(manual_eady$man_match) # 21 additional matches

# Share of all visits covered by exact, stub and manual matches together
sum(unique_eady$n[unique_eady$exact == 1],
    unmatched_eady$n[unmatched_eady$stub_match == 1],
    manual_eady$n[manual_eady$man_match == 1]) /
  sum(unique_eady$n) # 60.9% of news visits now covered

# Construct list of matched domains: the original comScore domain, the name
# used to look it up in the Eady et al. data, its visit count and a 1/0 match
# flag. Every manually coded row is included, whatever its match flag.
domains_eady <- bind_rows(
  unique_eady %>%
    filter(exact == 1) %>%
    transmute(orig_eady_domain = eady_domain,
              eady_domain_recode = eady_domain,
              n,
              eady_match = exact),
  unmatched_eady %>%
    filter(stub_match == 1) %>%
    transmute(orig_eady_domain = eady_domain,
              eady_domain_recode = stub,
              n,
              eady_match = stub_match),
  manual_eady %>%
    transmute(orig_eady_domain = domain,
              eady_domain_recode = recode,
              n,
              eady_match = man_match)
) %>%
  arrange(desc(n)) %>%
  # Merge in Eady et al. scores (zeta) by the recoded domain name
  left_join(select(eady, domain, zeta),
            by = c("eady_domain_recode" = "domain"))

# > Merge web data and Eady et al. scores ----

# Attach the Eady et al. match information to every visit. The visit count
# column is left out of the lookup table before joining. This is a left join,
# so visits without an Eady et al. score are kept, with NA in the joined
# columns.
web_scored <- left_join(web_scored,
                        select(domains_eady, -n),
                        by = c("eady_domain" = "orig_eady_domain"))

# Number and share of visits that received an Eady et al. score
eady_matched <- sum(!is.na(web_scored$zeta))
eady_perc_matched <- mean(!is.na(web_scored$zeta))

# Rows of the domain list that carry a score
eady_cs_matched <- sum(!is.na(domains_eady$zeta))

# Distinct recoded domains among the rows flagged as matched
eady_matched_domains <- n_distinct(
  domains_eady$eady_domain_recode[which(domains_eady$eady_match == 1)]
)

# Descriptives for Eady et al. scores
eady_matched # Number of URL visits with Eady et al. score
eady_perc_matched # Percent of URL visits with Eady et al. score
eady_cs_matched # Number of comScore domains matched to Eady et al. scores
eady_matched_domains # Number of Eady et al. scores matched to comScore data

# Save cleaned data ----

# Write the scored visit-level data in both RDS and CSV form. write_rds()
# returns its input invisibly, so the same object is passed on to write_csv().
web_scored %>%
  write_rds("data/web_data.rds") %>%
  write_csv("data/web_data.csv")
